# import all required libraries, dataset and print top 10 rows of the data
from datetime import datetime, timedelta
import pandas as pd
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from __future__ import division
import chart_studio.plotly as py
import plotly.offline as pyoff
import plotly.graph_objs as go
#initiate visualization library for jupyter notebook
pyoff.init_notebook_mode()
tx_data = pd.read_excel('OnlineRetail.xlsx')
tx_data.head(10)
| InvoiceNo | StockCode | Description | Quantity | InvoiceDate | UnitPrice | CustomerID | Country | |
|---|---|---|---|---|---|---|---|---|
| 0 | 536365 | 85123A | WHITE HANGING HEART T-LIGHT HOLDER | 6 | 2010-01-12 08:26:00 | 2.55 | 17850.0 | United Kingdom |
| 1 | 536365 | 71053 | WHITE METAL LANTERN | 6 | 2010-01-12 08:26:00 | 3.39 | 17850.0 | United Kingdom |
| 2 | 536365 | 84406B | CREAM CUPID HEARTS COAT HANGER | 8 | 2010-01-12 08:26:00 | 2.75 | 17850.0 | United Kingdom |
| 3 | 536365 | 84029G | KNITTED UNION FLAG HOT WATER BOTTLE | 6 | 2010-01-12 08:26:00 | 3.39 | 17850.0 | United Kingdom |
| 4 | 536365 | 84029E | RED WOOLLY HOTTIE WHITE HEART. | 6 | 2010-01-12 08:26:00 | 3.39 | 17850.0 | United Kingdom |
| 5 | 536365 | 22752 | SET 7 BABUSHKA NESTING BOXES | 2 | 2010-01-12 08:26:00 | 7.65 | 17850.0 | United Kingdom |
| 6 | 536365 | 21730 | GLASS STAR FROSTED T-LIGHT HOLDER | 6 | 2010-01-12 08:26:00 | 4.25 | 17850.0 | United Kingdom |
| 7 | 536366 | 22633 | HAND WARMER UNION JACK | 6 | 2010-01-12 08:28:00 | 1.85 | 17850.0 | United Kingdom |
| 8 | 536366 | 22632 | HAND WARMER RED POLKA DOT | 6 | 2010-01-12 08:28:00 | 1.85 | 17850.0 | United Kingdom |
| 9 | 536367 | 84879 | ASSORTED COLOUR BIRD ORNAMENT | 32 | 2010-01-12 08:34:00 | 1.69 | 13047.0 | United Kingdom |
# InvoiceDate arrives as text; convert it to a proper datetime column.
tx_data['InvoiceDate'] = pd.to_datetime(tx_data['InvoiceDate'])

def _to_year_month(ts):
    # Encode year+month as one integer, e.g. 2011-03 -> 201103.
    return ts.year * 100 + ts.month

# YearMonth key for easy monthly grouping and reporting.
tx_data['InvoiceYearMonth'] = tx_data['InvoiceDate'].map(_to_year_month)

# Line-level revenue, then total revenue per month.
tx_data['Revenue'] = tx_data['Quantity'] * tx_data['UnitPrice']
tx_revenue = tx_data.groupby('InvoiceYearMonth', as_index=False)['Revenue'].sum()
tx_revenue
| InvoiceYearMonth | Revenue | |
|---|---|---|
| 0 | 201001 | 58635.560 |
| 1 | 201002 | 46207.280 |
| 2 | 201003 | 45620.460 |
| 3 | 201005 | 31383.950 |
| 4 | 201006 | 53860.180 |
| 5 | 201007 | 45059.050 |
| 6 | 201008 | 44189.840 |
| 7 | 201009 | 52532.130 |
| 8 | 201010 | 57404.910 |
| 9 | 201012 | 314063.660 |
| 10 | 201101 | 607748.110 |
| 11 | 201102 | 542511.540 |
| 12 | 201103 | 712059.510 |
| 13 | 201104 | 660149.571 |
| 14 | 201105 | 733769.320 |
| 15 | 201106 | 745252.340 |
| 16 | 201107 | 843891.231 |
| 17 | 201108 | 715632.720 |
| 18 | 201109 | 992555.452 |
| 19 | 201110 | 931852.160 |
| 20 | 201111 | 1297606.420 |
| 21 | 201112 | 215762.540 |
# X and Y axis inputs for the Plotly graph; Scatter draws a line chart.
plot_data = [
    go.Scatter(
        x=tx_revenue['InvoiceYearMonth'],
        y=tx_revenue['Revenue'],
    )
]
plot_layout = go.Layout(
    # Treat the YearMonth codes as categories so months are evenly spaced.
    xaxis={"type": "category"},
    title='Monthly Revenue'  # FIX: typo — was 'Montly Revenue'
)
fig = go.Figure(data=plot_data, layout=plot_layout)
pyoff.iplot(fig)
# Month-over-month revenue growth rate via pct_change().
tx_revenue['MonthlyGrowth'] = tx_revenue['Revenue'].pct_change()
# Quick look at the first five rows.
tx_revenue.head()
# Line graph of the monthly growth rate.
# Hoisted the duplicated query so the filter runs once, and fixed the
# title typo ('Montly' -> 'Monthly'). The boundary months are excluded
# because they are only partially covered by the data.
growth_window = tx_revenue.query("InvoiceYearMonth < 201112 & InvoiceYearMonth > 201012")
plot_data = [
    go.Scatter(
        x=growth_window['InvoiceYearMonth'],
        y=growth_window['MonthlyGrowth'],
    )
]
plot_layout = go.Layout(
    xaxis={"type": "category"},
    title='Monthly Growth Rate'
)
fig = go.Figure(data=plot_data, layout=plot_layout)
pyoff.iplot(fig)
# Restrict the analysis to UK customers only.
tx_uk = tx_data.query("Country=='United Kingdom'").reset_index(drop=True)

# Monthly active customers = distinct CustomerIDs seen in each month.
tx_monthly_active = (
    tx_uk.groupby('InvoiceYearMonth')['CustomerID']
         .nunique()
         .reset_index()
)
# Show the dataframe.
tx_monthly_active
# Bar chart of monthly active customers over the fully-covered months.
active_window = tx_monthly_active.query(
    "InvoiceYearMonth < 201112 & InvoiceYearMonth > 201012"
)
plot_data = [
    go.Bar(
        x=active_window['InvoiceYearMonth'],
        y=active_window['CustomerID'],
    )
]
plot_layout = go.Layout(
    xaxis={"type": "category"},
    title='Monthly Active Customers'
)
fig = go.Figure(data=plot_data, layout=plot_layout)
pyoff.iplot(fig)
# Total quantity sold per month. NOTE(review): this sums item quantities,
# not distinct invoices — presumably intended as an order-volume proxy.
tx_monthly_sales = (
    tx_uk.groupby('InvoiceYearMonth', as_index=False)['Quantity'].sum()
)
# Show the dataframe.
tx_monthly_sales
# Bar chart of monthly quantity sold over the fully-covered months.
sales_window = tx_monthly_sales.query(
    "InvoiceYearMonth < 201112 & InvoiceYearMonth > 201012"
)
plot_data = [
    go.Bar(
        x=sales_window['InvoiceYearMonth'],
        y=sales_window['Quantity'],
    )
]
plot_layout = go.Layout(
    xaxis={"type": "category"},
    title='Monthly Total # of Order'
)
fig = go.Figure(data=plot_data, layout=plot_layout)
pyoff.iplot(fig)
# Average revenue per month. NOTE(review): this is the mean over invoice
# lines (rows), not per order — confirm that is the intended metric.
tx_monthly_order_avg = (
    tx_uk.groupby('InvoiceYearMonth', as_index=False)['Revenue'].mean()
)
# Show the dataframe.
tx_monthly_order_avg
# Bar chart of the monthly revenue average over the fully-covered months.
avg_window = tx_monthly_order_avg.query(
    "InvoiceYearMonth < 201112 & InvoiceYearMonth > 201012"
)
plot_data = [
    go.Bar(
        x=avg_window['InvoiceYearMonth'],
        y=avg_window['Revenue'],
    )
]
plot_layout = go.Layout(
    xaxis={"type": "category"},
    title='Monthly Order Average'
)
fig = go.Figure(data=plot_data, layout=plot_layout)
pyoff.iplot(fig)
We have looked at our major metrics. Of course there are many more, and they vary across industries. Let’s continue investigating some other important metrics:
# First purchase date for each customer (CustomerID -> MinPurchaseDate).
tx_min_purchase = tx_uk.groupby('CustomerID').InvoiceDate.min().reset_index()
tx_min_purchase.columns = ['CustomerID', 'MinPurchaseDate']
# Same YearMonth integer encoding used elsewhere (e.g. 2011-03 -> 201103).
tx_min_purchase['MinPurchaseYearMonth'] = tx_min_purchase['MinPurchaseDate'].map(
    lambda d: d.year * 100 + d.month
)

# Attach each customer's first-purchase info to every transaction row.
tx_uk = pd.merge(tx_uk, tx_min_purchase, on='CustomerID')
tx_uk.head()
# Classify each row: a purchase made after the customer's first-purchase
# month comes from an Existing customer; everything else is New.
tx_uk['UserType'] = 'New'
is_existing = tx_uk['InvoiceYearMonth'] > tx_uk['MinPurchaseYearMonth']
tx_uk.loc[is_existing, 'UserType'] = 'Existing'

# Monthly revenue broken down by user type.
tx_user_type_revenue = (
    tx_uk.groupby(['InvoiceYearMonth', 'UserType'], as_index=False)['Revenue'].sum()
)
# Drop the partially-covered boundary months, then plot one line per
# user type.
tx_user_type_revenue = tx_user_type_revenue.query(
    "InvoiceYearMonth != 201012 and InvoiceYearMonth != 201112"
)
existing_rev = tx_user_type_revenue.query(
    "UserType == 'Existing' & InvoiceYearMonth >= 201012"
)
new_rev = tx_user_type_revenue.query(
    "UserType == 'New' & InvoiceYearMonth >= 201012"
)
plot_data = [
    go.Scatter(
        x=existing_rev['InvoiceYearMonth'],
        y=existing_rev['Revenue'],
        name='Existing'
    ),
    go.Scatter(
        x=new_rev['InvoiceYearMonth'],
        y=new_rev['Revenue'],
        name='New'
    )
]
plot_layout = go.Layout(
    xaxis={"type": "category"},
    title='New vs Existing'
)
fig = go.Figure(data=plot_data, layout=plot_layout)
pyoff.iplot(fig)
# Ratio of new to existing unique customers per month. Months missing
# either side yield NaN (e.g. the first month, which presumably has no
# Existing customers yet) and are dropped.
new_counts = (
    tx_uk.query("UserType == 'New'")
         .groupby('InvoiceYearMonth')['CustomerID'].nunique()
)
existing_counts = (
    tx_uk.query("UserType == 'Existing'")
         .groupby('InvoiceYearMonth')['CustomerID'].nunique()
)
tx_user_ratio = (new_counts / existing_counts).reset_index().dropna()
# Show the dataframe.
tx_user_ratio
# Bar chart of the new-customer ratio for Feb–Nov 2011.
ratio_window = tx_user_ratio.query(
    "InvoiceYearMonth>201101 and InvoiceYearMonth<201112"
)
plot_data = [
    go.Bar(
        x=ratio_window['InvoiceYearMonth'],
        y=ratio_window['CustomerID'],
    )
]
plot_layout = go.Layout(
    xaxis={"type": "category"},
    title='New Customer Ratio'
)
fig = go.Figure(data=plot_data, layout=plot_layout)
pyoff.iplot(fig)
# Revenue per customer per month; a row exists only if the customer was
# active (purchased) that month.
tx_user_purchase = (
    tx_uk.groupby(['CustomerID', 'InvoiceYearMonth'], as_index=False)['Revenue'].sum()
)
# Customer x month activity matrix; since (CustomerID, month) pairs are
# unique after the groupby, crosstab cells are effectively 0/1 flags.
tx_retention = pd.crosstab(
    tx_user_purchase['CustomerID'], tx_user_purchase['InvoiceYearMonth']
).reset_index()
tx_retention.head()
# For each month, count users active that month (Total) and users active
# both that month and the previous one (Retained).
# NOTE(review): columns[2:] skips the CustomerID column AND the first
# activity month — confirm the extra month skip is intentional.
months = tx_retention.columns[2:]
retention_array = []
for prev_month, selected_month in zip(months[:-1], months[1:]):
    active_now = tx_retention[selected_month] > 0
    active_before = tx_retention[prev_month] > 0
    retention_array.append({
        'InvoiceYearMonth': int(selected_month),
        'TotalUserCount': tx_retention[selected_month].sum(),
        'RetainedUserCount': tx_retention[active_now & active_before][selected_month].sum(),
    })
# Turn the per-month records into a dataframe and derive the rate.
tx_retention = pd.DataFrame(retention_array)
tx_retention['RetentionRate'] = tx_retention['RetainedUserCount'] / tx_retention['TotalUserCount']
# Line graph of the monthly retention rate, Feb–Nov 2011.
retention_window = tx_retention.query(
    "InvoiceYearMonth<201112 & InvoiceYearMonth>=201102"
)
plot_data = [
    go.Scatter(
        x=retention_window['InvoiceYearMonth'],
        y=retention_window['RetentionRate'],
        name="organic"
    )
]
plot_layout = go.Layout(
    xaxis={"type": "category"},
    title='Monthly Retention Rate'
)
fig = go.Figure(data=plot_data, layout=plot_layout)
pyoff.iplot(fig)
# Rebuild the activity crosstab and attach each customer's cohort
# (first-purchase YearMonth).
tx_retention = pd.crosstab(
    tx_user_purchase['CustomerID'], tx_user_purchase['InvoiceYearMonth']
).reset_index()
tx_retention = pd.merge(
    tx_retention,
    tx_min_purchase[['CustomerID', 'MinPurchaseYearMonth']],
    on='CustomerID',
)
# Prefix every column except the last with 'm_' so the numeric month
# names become valid identifiers inside .query() expressions.
tx_retention.columns = (
    ['m_' + str(c) for c in tx_retention.columns[:-1]] + ['MinPurchaseYearMonth']
)
# Cohort retention: for each first-purchase month, the share of that
# cohort still active in every later month.
retention_array = []
for i, selected_month in enumerate(months):
    cohort_row = {}
    # Months before the cohort's first purchase can never be retained.
    for earlier_month in months[:i]:
        cohort_row[earlier_month] = np.nan
    cohort = tx_retention[tx_retention.MinPurchaseYearMonth == selected_month]
    total_user_count = cohort.MinPurchaseYearMonth.count()
    cohort_row['TotalUserCount'] = total_user_count
    # By definition the whole cohort is active in its first month.
    cohort_row[selected_month] = 1
    base_query = "MinPurchaseYearMonth == {}".format(selected_month)
    for later_month in months[i + 1:]:
        col = 'm_' + str(later_month)
        retained = tx_retention.query(base_query + " and {} > 0".format(col))[col].sum()
        cohort_row[later_month] = np.round(retained / total_user_count, 2)
    retention_array.append(cohort_row)
tx_retention = pd.DataFrame(retention_array)
tx_retention.index = months
#showing new cohort based retention table
tx_retention
| TotalUserCount | 201002 | 201003 | 201005 | 201006 | 201007 | 201008 | 201009 | 201010 | 201012 | ... | 201103 | 201104 | 201105 | 201106 | 201107 | 201108 | 201109 | 201110 | 201111 | 201112 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| InvoiceYearMonth | |||||||||||||||||||||
| 201002 | 105 | 1.0 | 0.01 | 0.00 | 0.03 | 0.07 | 0.05 | 0.11 | 0.04 | 0.30 | ... | 0.30 | 0.35 | 0.34 | 0.39 | 0.44 | 0.30 | 0.37 | 0.41 | 0.49 | 0.16 |
| 201003 | 38 | NaN | 1.00 | 0.03 | 0.03 | 0.03 | 0.03 | 0.08 | 0.00 | 0.29 | ... | 0.39 | 0.34 | 0.47 | 0.37 | 0.39 | 0.37 | 0.26 | 0.55 | 0.42 | 0.13 |
| 201005 | 63 | NaN | NaN | 1.00 | 0.05 | 0.02 | 0.02 | 0.05 | 0.02 | 0.35 | ... | 0.51 | 0.43 | 0.44 | 0.49 | 0.43 | 0.48 | 0.46 | 0.51 | 0.59 | 0.30 |
| 201006 | 72 | NaN | NaN | NaN | 1.00 | 0.03 | 0.06 | 0.06 | 0.01 | 0.36 | ... | 0.31 | 0.39 | 0.46 | 0.39 | 0.49 | 0.33 | 0.43 | 0.33 | 0.50 | 0.06 |
| 201007 | 51 | NaN | NaN | NaN | NaN | 1.00 | 0.04 | 0.04 | 0.02 | 0.16 | ... | 0.41 | 0.35 | 0.51 | 0.43 | 0.43 | 0.35 | 0.39 | 0.35 | 0.55 | 0.12 |
| 201008 | 83 | NaN | NaN | NaN | NaN | NaN | 1.00 | 0.04 | 0.02 | 0.35 | ... | 0.40 | 0.37 | 0.42 | 0.39 | 0.39 | 0.43 | 0.40 | 0.47 | 0.58 | 0.25 |
| 201009 | 71 | NaN | NaN | NaN | NaN | NaN | NaN | 1.00 | 0.01 | 0.25 | ... | 0.38 | 0.28 | 0.35 | 0.32 | 0.35 | 0.28 | 0.31 | 0.41 | 0.37 | 0.11 |
| 201010 | 36 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 1.00 | 0.31 | ... | 0.39 | 0.47 | 0.22 | 0.36 | 0.25 | 0.33 | 0.36 | 0.39 | 0.42 | 0.17 |
| 201012 | 261 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 1.00 | ... | 0.33 | 0.31 | 0.35 | 0.38 | 0.33 | 0.31 | 0.36 | 0.30 | 0.39 | 0.13 |
| 201101 | 530 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | 0.25 | 0.25 | 0.28 | 0.32 | 0.30 | 0.24 | 0.32 | 0.35 | 0.39 | 0.15 |
| 201102 | 374 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | 0.22 | 0.20 | 0.23 | 0.27 | 0.28 | 0.25 | 0.28 | 0.28 | 0.31 | 0.09 |
| 201103 | 382 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | 1.00 | 0.18 | 0.21 | 0.21 | 0.23 | 0.21 | 0.21 | 0.24 | 0.27 | 0.06 |
| 201104 | 297 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | NaN | 1.00 | 0.21 | 0.19 | 0.19 | 0.19 | 0.21 | 0.29 | 0.24 | 0.05 |
| 201105 | 248 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | 1.00 | 0.19 | 0.20 | 0.18 | 0.24 | 0.17 | 0.25 | 0.06 |
| 201106 | 241 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | 1.00 | 0.17 | 0.17 | 0.21 | 0.19 | 0.29 | 0.08 |
| 201107 | 205 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | 1.00 | 0.19 | 0.19 | 0.21 | 0.27 | 0.03 |
| 201108 | 143 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | 1.00 | 0.16 | 0.24 | 0.26 | 0.05 |
| 201109 | 216 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | 1.00 | 0.15 | 0.20 | 0.02 |
| 201110 | 222 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 1.00 | 0.19 | 0.04 |
| 201111 | 201 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 1.00 | 0.05 |
| 201112 | 20 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 1.00 |
21 rows × 22 columns